/*
 * radixSortAdvancedKernelsDX11
 *
 * HLSL compute-shader source for a 4-bits-per-pass radix sort of
 * (key, value) pairs, stored as one C string built from adjacent string
 * literals.  The C comments between the literals are NOT part of the string.
 *
 * Entry points contained in the string (all [numthreads(128,1,1)]):
 *   StreamCountKernel     - counts, per work-group, how many keys fall into
 *                           each of the 16 buckets of the current digit.
 *   PrefixScanKernel      - exclusive scan over the flat histogram table.
 *   SortAndScatterKernel  - ranks each 512-element block per bucket in
 *                           group-shared memory, then scatters to the output.
 *   SortAndScatterKernel1 - same result via four 1-bit split passes.
 *   CopyKernel            - copies dataToSort to dataToSortOut unchanged.
 *
 * NOTE(review): how and with which bindings the host compiles and dispatches
 * these kernels is not visible in this file.
 */
static const char* radixSortAdvancedKernelsDX11= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
/* Convenience macros: thread/group index accessors, the group-wide barrier,
 * the common kernel argument list, and LDS atomic increments. */
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
"#define AtomInc(x) InterlockedAdd(x, 1)\n"
"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
"\n"
"#define min2 min\n"
"#define max2 max\n"
"\n"
"\n"
/* Per-dispatch constants:
 *   m_startBit             - shift applied to a key before masking with 0xf
 *   m_totalBlocks          - upper bound for the block index (igroup)
 *   m_nWorkGroupsToExecute - row stride of the histogram table
 *   m_nBlocksPerGroup      - number of consecutive blocks each group walks */
"cbuffer CB0 : register( b0 )\n"
"{\n"
"	int m_startBit;\n"
"	int m_totalBlocks;\n"
"	int m_nWorkGroupsToExecute;\n"
"	int m_nBlocksPerGroup;\n"
"\n"
"};\n"
"\n"
"\n"
/* Element being sorted: ordering uses only .key; .value is carried along. */
"typedef struct {\n"
"    unsigned int key;\n"
"    unsigned int value;\n"
"} KeyValuePair;\n"
"\n"
"\n"
/* Resources used by the scatter and copy kernels.  Further down,
 * dataToSort1 / wHistogram1 are declared on the same t0 / u0 slots for the
 * count and scan kernels.
 * NOTE(review): presumably the host rebinds t0/u0 per dispatch - confirm. */
"StructuredBuffer<u32> rHistogram : register(t0);\n"
"\n"
"RWStructuredBuffer<KeyValuePair> dataToSort : register( u0 );\n"
"RWStructuredBuffer<KeyValuePair> dataToSortOut : register( u1 );\n"
"\n"
"\n"
"\n"
/* Tuning constants.  One block = WG_SIZE * ELEMENTS_PER_WORK_ITEM = 512
 * elements; one pass sorts on BITS_PER_PASS = 4 bits, i.e. 16 buckets. */
"#define WG_SIZE 128\n"
"#define ELEMENTS_PER_WORK_ITEM 4\n"
"#define BITS_PER_PASS 4\n"
"#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
"\n"
"\n"
/* Group-shared storage:
 *   sorterSharedMemory    - scan scratch (up to 4*WG_SIZE entries) and
 *                           key/value staging (2 u32 per element)
 *   localHistogramToCarry - running output offset per bucket for this group
 *   localHistogram        - 16 zero-padding entries + 16 bucket counters
 *   localHistogramMat     - one counter column per thread, one row per bucket
 *   localPrefixSum        - start offset of each bucket inside the block */
"groupshared u32 sorterSharedMemory[max(WG_SIZE*2*2, WG_SIZE*ELEMENTS_PER_WORK_ITEM*2)];\n"
"groupshared u32 localHistogramToCarry[NUM_BUCKET];\n"
"groupshared u32 localHistogram[NUM_BUCKET*2];\n"
"groupshared u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
"groupshared u32 localPrefixSum[NUM_BUCKET];\n"
"\n"
"\n"
"\n"
/* Store / load one KeyValuePair at element slot idx of sorterSharedMemory
 * (key at 2*idx, value at 2*idx+1).  Both macros expand to two statements,
 * so they must only be used where a statement sequence is valid. */
"#define SET_LOCAL_SORT_DATA(idx, sortDataIn) sorterSharedMemory[2*(idx)+0] = sortDataIn.key; sorterSharedMemory[2*(idx)+1] = sortDataIn.value; \n"
"#define GET_LOCAL_SORT_DATA(idx, sortDataOut) sortDataOut.key = sorterSharedMemory[2*(idx)+0]; sortDataOut.value = sorterSharedMemory[2*(idx)+1];\n"
"\n"
"\n"
"\n"
/* prefixScanVector: inclusive prefix sum over the 4 components of data
 * (x, x+y, x+y+z, x+y+z+w). */
"uint4 prefixScanVector( uint4 data )\n"
"{\n"
"	data.y += data.x;\n"
"	data.w += data.z;\n"
"	data.z += data.y;\n"
"	data.w += data.y;\n"
"	return data;\n"
"}\n"
"\n"
/* prefixScanVectorEx: turns data (inout) into its exclusive prefix sum
 * (inclusive sum minus the original values) and returns the total of the
 * four original components. */
"uint prefixScanVectorEx( inout uint4 data )\n"
"{\n"
"	uint4 backup = data;\n"
"	data.y += data.x;\n"
"	data.w += data.z;\n"
"	data.z += data.y;\n"
"	data.w += data.y;\n"
"	uint sum = data.w;\n"
"	data -= backup;\n"
"	return sum;\n"
"}\n"
"\n"
/* localPrefixScan128: scan of one uint per thread across the work-group.
 *   pData    - this thread's value
 *   lIdx     - local thread index (0..127)
 *   totalSum - out: sum over all 128 threads
 *   returns  - exclusive prefix (sum of the values of threads below lIdx)
 * Layout: entries [0,WG_SIZE) are zero padding so the idx-k reads below never
 * need a bounds check; the values live in [WG_SIZE, 2*WG_SIZE). */
"uint localPrefixScan128( uint pData, uint lIdx, inout uint totalSum )\n"
"{\n"
"	{	//	Set data\n"
"		sorterSharedMemory[lIdx] = 0;\n"
"		sorterSharedMemory[lIdx+WG_SIZE] = pData;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
/* 64 threads each own one odd slot and accumulate at strides 1,2,4,...,64;
 * the last line then completes the even slots from their left neighbour.
 * There is no barrier between the steps.
 * NOTE(review): presumably relies on these 64 threads running in lockstep
 * (one 64-wide wavefront) - confirm for the target hardware. */
"	{	//	Prefix sum\n"
"		int idx = 2*lIdx + (WG_SIZE+1);\n"
"		if( lIdx < 64 )\n"
"		{\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];					\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];			\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];		\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"		}\n"
"		if( lIdx < 64 ) sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
/* Slot lIdx+127 is the inclusive sum of the previous thread (the zero
 * padding for thread 0), i.e. this thread's exclusive prefix. */
"	totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
"	return sorterSharedMemory[lIdx+127];\n"
"}\n"
"\n"
/* localPrefixScan128Dual: two independent scans of the localPrefixScan128
 * kind performed at once.  Input 0 uses entries [0, 2*WG_SIZE) and input 1
 * uses [2*WG_SIZE, 4*WG_SIZE); threads 0..63 scan the first region and
 * threads 64..127 the second.
 *   rank0/rank1         - out: exclusive prefix of pData0/pData1
 *   totalSum0/totalSum1 - out: group-wide totals */
"void localPrefixScan128Dual( uint pData0, uint pData1, uint lIdx, \n"
"							inout uint rank0, inout uint rank1,\n"
"							inout uint totalSum0, inout uint totalSum1 )\n"
"{\n"
"	{	//	Set data\n"
"		sorterSharedMemory[lIdx] = 0;\n"
"		sorterSharedMemory[lIdx+WG_SIZE] = pData0;\n"
"		sorterSharedMemory[2*WG_SIZE+lIdx] = 0;\n"
"		sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"//	if( lIdx < 128 ) // todo. assert wg size is 128\n"
"	{	//	Prefix sum\n"
"		int blockIdx = lIdx/64;\n"
"		int groupIdx = lIdx%64;\n"
"		int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;\n"
"\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-2];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-32];		\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
"		sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	totalSum0 = sorterSharedMemory[WG_SIZE*2-1];\n"
"	rank0 = sorterSharedMemory[lIdx+127];\n"
"	totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];\n"
"	rank1 = sorterSharedMemory[2*WG_SIZE+lIdx+127];\n"
"}\n"
"\n"
/* localPrefixSum128V: exclusive prefix sum over 4 values per thread
 * (512 values per group, thread lIdx owning elements 4*lIdx .. 4*lIdx+3).
 * prefixScanVectorEx first makes pData exclusive within the thread and
 * yields the per-thread total, which is then scanned across threads as in
 * localPrefixScan128.
 *   totalSum - out: sum of all 512 values
 *   returns  - exclusive prefix for each of this thread's 4 values */
"uint4 localPrefixSum128V( uint4 pData, uint lIdx, inout uint totalSum )\n"
"{\n"
"	{	//	Set data\n"
"		sorterSharedMemory[lIdx] = 0;\n"
"		sorterSharedMemory[lIdx+WG_SIZE] = prefixScanVectorEx( pData );\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	{	//	Prefix sum\n"
"		int idx = 2*lIdx + (WG_SIZE+1);\n"
"		if( lIdx < 64 )\n"
"		{\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];					\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];			\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];		\n"
"			sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
"			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
"		}\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	totalSum = sorterSharedMemory[WG_SIZE*2-1];\n"
"	uint addValue = sorterSharedMemory[lIdx+127];\n"
"	return pData + uint4(addValue, addValue, addValue, addValue);\n"
"}\n"
"\n"
/* localPrefixSum128Dual: two localPrefixSum128V-style exclusive scans done
 * together, using the same two-region layout as localPrefixScan128Dual.
 * The outputs are formed as (inclusive within-thread sum) + (prefix of the
 * preceding threads) - (original value), which is the exclusive prefix. */
"void localPrefixSum128Dual( uint4 pData0, uint4 pData1, uint lIdx, \n"
"						   inout uint4 dataOut0, inout uint4 dataOut1, \n"
"						   inout uint totalSum0, inout uint totalSum1 )\n"
"{\n"
"/*\n"
"	dataOut0 = localPrefixSum128V( pData0, lIdx, totalSum0 );\n"
"	GROUP_LDS_BARRIER;\n"
"	dataOut1 = localPrefixSum128V( pData1, lIdx, totalSum1 );\n"
"	return;\n"
"*/\n"
"\n"
"	uint4 backup0 = pData0;\n"
"	uint4 backup1 = pData1;\n"
"\n"
"	{	// Prefix sum in a vector\n"
"		pData0 = prefixScanVector( pData0 );\n"
"		pData1 = prefixScanVector( pData1 );\n"
"	}\n"
"\n"
"	{	//	Set data\n"
"		sorterSharedMemory[lIdx] = 0;\n"
"		sorterSharedMemory[lIdx+WG_SIZE] = pData0.w;\n"
"		sorterSharedMemory[2*WG_SIZE+lIdx] = 0;\n"
"		sorterSharedMemory[2*WG_SIZE+lIdx+WG_SIZE] = pData1.w;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"//	if( lIdx < 128 ) // todo. assert wg size is 128\n"
"	{	//	Prefix sum\n"
"		int blockIdx = lIdx/64;\n"
"		int groupIdx = lIdx%64;\n"
"		int idx = 2*groupIdx + (WG_SIZE+1) + (2*WG_SIZE)*blockIdx;\n"
"\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-1];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-2];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-4];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-8];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-16];\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-32];		\n"
"		sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
"\n"
"		sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	totalSum0 = sorterSharedMemory[WG_SIZE*2-1];\n"
"	{\n"
"		uint addValue = sorterSharedMemory[lIdx+127];\n"
"		dataOut0 = pData0 + uint4(addValue, addValue, addValue, addValue) - backup0;\n"
"	}\n"
"\n"
"	totalSum1 = sorterSharedMemory[2*WG_SIZE+WG_SIZE*2-1];\n"
"	{\n"
"		uint addValue = sorterSharedMemory[2*WG_SIZE+lIdx+127];\n"
"		dataOut1 = pData1 + uint4(addValue, addValue, addValue, addValue) - backup1;\n"
"	}\n"
"}\n"
"\n"
/* extractKeys: per component, 1 where data equals targetKey, otherwise 0. */
"uint4 extractKeys(uint4 data, uint targetKey)\n"
"{\n"
"	uint4 key;\n"
"	key.x = data.x == targetKey ? 1:0;\n"
"	key.y = data.y == targetKey ? 1:0;\n"
"	key.z = data.z == targetKey ? 1:0;\n"
"	key.w = data.w == targetKey ? 1:0;\n"
"	return key;\n"
"}\n"
"\n"
/* extractKeysByBits: per component, the value (0 or 1) of bit number
 * targetKey of data. */
"uint4 extractKeysByBits(uint4 data, uint targetKey)\n"
"{\n"
"	uint4 key;\n"
"	uint mask = 1<<targetKey;\n"
"	key.x = (data.x & mask) >> targetKey;\n"
"	key.y = (data.y & mask) >> targetKey;\n"
"	key.z = (data.z & mask) >> targetKey;\n"
"	key.w = (data.w & mask) >> targetKey;\n"
"	return key;\n"
"}\n"
"\n"
/* packKeys: combines two counters into one 32-bit word, lower in bits 0-15
 * and upper in bits 16-31.  lower is not masked, so it must fit in 16 bits
 * for extractLower/extractUpper to give the inputs back. */
"uint packKeys(uint lower, uint upper)\n"
"{\n"
"	return lower|(upper<<16);\n"
"}\n"
"\n"
"uint4 packKeys(uint4 lower, uint4 upper)\n"
"{\n"
"	return uint4( lower.x|(upper.x<<16), lower.y|(upper.y<<16), lower.z|(upper.z<<16), lower.w|(upper.w<<16) );\n"
"}\n"
"\n"
/* extractLower / extractUpper: the low / high 16-bit half of each word,
 * i.e. the inverse of packKeys. */
"uint extractLower( uint data )\n"
"{\n"
"	return data&0xffff;\n"
"}\n"
"\n"
"uint extractUpper( uint data )\n"
"{\n"
"	return (data>>16)&0xffff;\n"
"}\n"
"\n"
"uint4 extractLower( uint4 data )\n"
"{\n"
"	return uint4( data.x&0xffff, data.y&0xffff, data.z&0xffff, data.w&0xffff );\n"
"}\n"
"\n"
"uint4 extractUpper( uint4 data )\n"
"{\n"
"	return uint4( (data.x>>16)&0xffff, (data.y>>16)&0xffff, (data.z>>16)&0xffff, (data.w>>16)&0xffff );\n"
"}\n"
"\n"
/* SortAndScatterKernel
 * For every block assigned to this work-group: read 4 elements per thread,
 * compute for each element its destination slot inside the block ordered by
 * the current 4-bit digit, stage the block in group-shared memory in that
 * order, then write it to dataToSortOut at
 *   localHistogramToCarry[bucket] + (position inside the bucket).
 * localHistogramToCarry starts from rHistogram[bucket*nWorkGroups + group]
 * and advances by the block's bucket counts after every block. */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel( DEFAULT_ARGS )        \n"
"{\n"
"	u32 lIdx = GET_LOCAL_IDX;\n"
"	u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
"	if( lIdx < (NUM_BUCKET) )\n"
"	{\n"
"		localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	for(uint igroup=wgIdx*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx+1)*m_nBlocksPerGroup); igroup++)\n"
"	{\n"
/* Only threads 0..15 ever assign or read myHistogram (thread b keeps the
 * count of bucket b); it is zero-initialised so no thread holds an
 * indeterminate value. */
"		u32 myHistogram = 0;\n"
"		if( lIdx < (NUM_BUCKET) )\n"
"		{\n"
/* Integer zero: localPrefixSum is a u32 array (this was the float literal
 * 0.f before). */
"			localPrefixSum[lIdx] = 0;\n"
"		}\n"
"\n"
"		u32 newOffset[4];\n"
"		KeyValuePair myData[4];\n"
"		{	//	read data\n"
"			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"			uint startAddress = igroup*numLocalElements + lIdx*4;\n"
"\n"
"			myData[0] = dataToSort[startAddress+0];\n"
"			myData[1] = dataToSort[startAddress+1];\n"
"			myData[2] = dataToSort[startAddress+2];\n"
"			myData[3] = dataToSort[startAddress+3];\n"
"\n"
"			newOffset[0] = newOffset[1] = newOffset[2] = newOffset[3] = 0;\n"
"		}\n"
"\n"
/* b holds the 4-bit digit of each of this thread's elements.  Buckets are
 * processed four at a time; localOffset is the number of elements in all
 * buckets handled by earlier iterations. */
"		int localOffset = 0;\n"
"		uint4 b = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
"		for(uint targetKey=0; targetKey<(NUM_BUCKET); targetKey+=4)\n"
"		{\n"
"			uint4 key[4];\n"
"			uint keySet[2];\n"
/* key[i] flags which of my elements belong to bucket targetKey+i; s[i] is
 * how many do.  Two counts are packed per word (a count cannot exceed the
 * 512 elements of a block, so 16 bits are enough) so that a single dual scan
 * covers four buckets. */
"			{	//	pack 4\n"
"				uint4 scannedKey[4];\n"
"				key[0] = scannedKey[0] = extractKeys( b, targetKey+0 );\n"
"				key[1] = scannedKey[1] = extractKeys( b, targetKey+1 );\n"
"				key[2] = scannedKey[2] = extractKeys( b, targetKey+2 );\n"
"				key[3] = scannedKey[3] = extractKeys( b, targetKey+3 );\n"
"				{\n"
"					uint s[4];\n"
"					s[0] = prefixScanVectorEx( scannedKey[0] );\n"
"					s[1] = prefixScanVectorEx( scannedKey[1] );\n"
"					s[2] = prefixScanVectorEx( scannedKey[2] );\n"
"					s[3] = prefixScanVectorEx( scannedKey[3] );\n"
"					keySet[0] = packKeys( s[0], s[1] );\n"
"					keySet[1] = packKeys( s[2], s[3] );\n"
"				}\n"
"			}\n"
"\n"
/* dstAddressBase[i]: number of bucket-(targetKey+i) elements owned by lower
 * threads.  histogram: block-wide count of each of the four buckets. */
"			uint dstAddressBase[4];\n"
"			{\n"
"\n"
"				uint totalSumPacked[2];\n"
"				uint dstAddressPacked[2];\n"
"\n"
"				localPrefixScan128Dual( keySet[0], keySet[1], lIdx, dstAddressPacked[0], dstAddressPacked[1], totalSumPacked[0], totalSumPacked[1] );\n"
"\n"
"				dstAddressBase[0] = extractLower( dstAddressPacked[0] );\n"
"				dstAddressBase[1] = extractUpper( dstAddressPacked[0] );\n"
"				dstAddressBase[2] = extractLower( dstAddressPacked[1] );\n"
"				dstAddressBase[3] = extractUpper( dstAddressPacked[1] );\n"
"\n"
"				uint4 histogram;\n"
"				histogram.x = extractLower(totalSumPacked[0]);\n"
"				histogram.y = extractUpper(totalSumPacked[0]);\n"
"				histogram.z = extractLower(totalSumPacked[1]);\n"
"				histogram.w = extractUpper(totalSumPacked[1]);\n"
"\n"
"				if( lIdx == targetKey + 0 ) myHistogram = histogram.x;\n"
"				else if( lIdx == targetKey + 1 ) myHistogram = histogram.y;\n"
"				else if( lIdx == targetKey + 2 ) myHistogram = histogram.z;\n"
"				else if( lIdx == targetKey + 3 ) myHistogram = histogram.w;\n"
"				\n"
/* After this call histogram holds exclusive offsets of the four buckets
 * relative to localOffset; localPrefixSum[bucket] becomes the first slot of
 * that bucket inside the block. */
"				uint histogramSum = prefixScanVectorEx( histogram );\n"
"\n"
"				if( lIdx == targetKey + 0 ) localPrefixSum[targetKey+0] = localOffset+histogram.x;\n"
"				else if( lIdx == targetKey + 1 ) localPrefixSum[targetKey+1] = localOffset+histogram.y;\n"
"				else if( lIdx == targetKey + 2 ) localPrefixSum[targetKey+2] = localOffset+histogram.z;\n"
"				else if( lIdx == targetKey + 3 ) localPrefixSum[targetKey+3] = localOffset+histogram.w;\n"
"\n"
"				localOffset += histogramSum;\n"
"			}\n"
"			\n"
"			GROUP_LDS_BARRIER;\n"
"\n"
"\n"
/* Destination = bucket start + elements of that bucket in lower threads +
 * elements of that bucket earlier in this thread.  Multiplying by the 0/1
 * flag adds the address only for the one bucket an element belongs to. */
"			for(int ie=0; ie<4; ie++)\n"
"			{\n"
"				uint4 scannedKey = key[ie];\n"
"				prefixScanVectorEx( scannedKey );\n"
"\n"
"				uint offset = localPrefixSum[targetKey + ie] + dstAddressBase[ie];\n"
"				uint4 dstAddress = uint4( offset, offset, offset, offset ) + scannedKey;\n"
"\n"
"				newOffset[0] += dstAddress.x*key[ie].x;\n"
"				newOffset[1] += dstAddress.y*key[ie].y;\n"
"				newOffset[2] += dstAddress.z*key[ie].z;\n"
"				newOffset[3] += dstAddress.w*key[ie].w;\n"
"			}\n"
"		}\n"
"\n"
"		{	//	local scatter\n"
"			SET_LOCAL_SORT_DATA(newOffset[0], myData[0]);\n"
"			SET_LOCAL_SORT_DATA(newOffset[1], myData[1]);\n"
"			SET_LOCAL_SORT_DATA(newOffset[2], myData[2]);\n"
"			SET_LOCAL_SORT_DATA(newOffset[3], myData[3]);\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"\n"
/* The staged block is now ordered by digit; slot dataIdx minus the bucket's
 * start is the element's position inside its bucket. */
"		{	//	write data\n"
"			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
"			{\n"
"				int dataIdx = 4*lIdx+i;\n"
"				KeyValuePair localData; GET_LOCAL_SORT_DATA( dataIdx, localData );\n"
"				int binIdx = (localData.key >> m_startBit) & 0xf;\n"
"				int groupOffset = localHistogramToCarry[binIdx];\n"
"				int myIdx = dataIdx - localPrefixSum[binIdx];\n"
"\n"
"				dataToSortOut[ groupOffset + myIdx ] = localData;\n"
"			}\n"
"		}\n"
"\n"
"		GROUP_LDS_BARRIER;\n"
"		if( lIdx < NUM_BUCKET )\n"
"		{\n"
"			localHistogramToCarry[lIdx] += myHistogram;\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"	}\n"
"}\n"
"\n"
"\n"
/* SortAndScatterKernel1
 * Same inputs and outputs as SortAndScatterKernel, but the block is ordered
 * in group-shared memory by four successive 1-bit split passes (bits
 * m_startBit .. m_startBit+3); a 16-bucket histogram of the block is then
 * built with LDS atomics and exclusive-scanned to locate each bucket. */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel1( DEFAULT_ARGS )\n"
"{\n"
"	u32 lIdx = GET_LOCAL_IDX;\n"
"	u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
"	if( lIdx < (NUM_BUCKET) )\n"
"	{\n"
"		localHistogramToCarry[lIdx] = rHistogram[lIdx*m_nWorkGroupsToExecute + wgIdx.x];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
"	{\n"
/* Assigned and read only by threads 0..15; zero-initialised for the rest. */
"		u32 myHistogram = 0;\n"
"\n"
"		KeyValuePair myData[4];\n"
"		uint startAddrBlock;\n"
"		{	//	read data\n"
"			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"			startAddrBlock = lIdx*4;\n"
"			uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
"			myData[0] = dataToSort[startAddress+0];\n"
"			myData[1] = dataToSort[startAddress+1];\n"
"			myData[2] = dataToSort[startAddress+2];\n"
"			myData[3] = dataToSort[startAddress+3];\n"
"		}\n"
"\n"
/* One split per bit: keys is 1 where the bit is 0.  rankOfP (exclusive scan
 * of those flags) is the destination of a zero-bit element; rankOfN places
 * one-bit elements after all zero-bit ones while keeping their order. */
"		//	local sort\n"
"		for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)\n"
"		{\n"
"			uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);\n"
"			uint total;\n"
"			uint4 rankOfP = localPrefixSum128V( keys, lIdx, total );\n"
"			uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );\n"
"\n"
"			uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;\n"
"			\n"
"			GROUP_LDS_BARRIER;\n"
"\n"
"			SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );\n"
"			SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );\n"
"			SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );\n"
"			SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"			\n"
"			GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );\n"
"			GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );\n"
"			GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );\n"
"			GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );\n"
"		}\n"
"\n"
/* Counters live in localHistogram[16..31]; entries [0..15] stay zero so the
 * hIdx-k reads of the scan below need no bounds check. */
"		{//	create histogram -> prefix sum\n"
"			if( lIdx < NUM_BUCKET )\n"
"			{\n"
"				localHistogram[lIdx] = 0;\n"
"				localHistogram[NUM_BUCKET+lIdx] = 0;\n"
"			}\n"
"			GROUP_LDS_BARRIER;\n"
"			uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
"			\n"
"			InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );\n"
"			InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );\n"
"			InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );\n"
"			InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );\n"
"			\n"
"			GROUP_LDS_BARRIER;\n"
"			\n"
"			uint hIdx = NUM_BUCKET+lIdx;\n"
"			if( lIdx < NUM_BUCKET )\n"
"			{\n"
"				myHistogram = localHistogram[hIdx];\n"
"			}\n"
"			GROUP_LDS_BARRIER;\n"
"	\n"
/* Shift right by one slot, then accumulate at strides 1,2,4,8: the result
 * is the exclusive prefix sum of the 16 counts.  No barrier separates the
 * steps.
 * NOTE(review): presumably relies on threads 0..15 running in lockstep -
 * confirm for the target hardware. */
"			if( lIdx < NUM_BUCKET )\n"
"			{\n"
"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
"\n"
"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
"			}\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"		}\n"
"/*\n"
"		{//	write back\n"
"			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"			startAddrBlock = lIdx*4;\n"
"			uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
"			{\n"
"				dataToSortOut[ startAddress+ie ] = myData[ie];\n"
"			}\n"
"		}\n"
"*/\n"
/* myData is the block in digit order, so (slot - bucket start) is the
 * element's position inside its bucket. */
"		{\n"
"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
"			{\n"
"				int dataIdx = startAddrBlock+ie;\n"
"				int binIdx = (myData[ie].key>>m_startBit)&0xf;\n"
"				int groupOffset = localHistogramToCarry[binIdx];\n"
"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
"				dataToSortOut[ groupOffset + myIdx ] = myData[ie];\n"
"			}\n"
"		}\n"
"		\n"
"		GROUP_LDS_BARRIER;\n"
"		if( lIdx < NUM_BUCKET )\n"
"		{\n"
"			localHistogramToCarry[lIdx] += myHistogram;\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"	\n"
"	}\n"
"}\n"
"\n"
/* Disabled earlier variant of SortAndScatterKernel1, kept inside an HLSL
 * block comment exactly as found. */
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void SortAndScatterKernel1( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )\n"
"{\n"
"	if( lIdx.x < (NUM_BUCKET) )\n"
"	{\n"
"		localHistogramToCarry[lIdx.x] = rHistogram[lIdx.x*m_nWorkGroupsToExecute + gIdx.x];\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
"	{\n"
"		u32 myHistogram;\n"
"\n"
"		KeyValuePair myData[4];\n"
"		uint startAddrBlock;\n"
"		{	//	read data\n"
"			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"			startAddrBlock = lIdx.x*4;\n"
"			uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
"			myData[0] = dataToSort[startAddress+0];\n"
"			myData[1] = dataToSort[startAddress+1];\n"
"			myData[2] = dataToSort[startAddress+2];\n"
"			myData[3] = dataToSort[startAddress+3];\n"
"		}\n"
"\n"
"		for(int ib=m_startBit; ib<m_startBit+BITS_PER_PASS; ib++)\n"
"		{\n"
"			uint4 keys = uint4(~(myData[0].key>>ib) & 0x1, ~(myData[1].key>>ib) & 0x1, ~(myData[2].key>>ib) & 0x1, ~(myData[3].key>>ib) & 0x1);\n"
"			uint total;\n"
"			uint4 rankOfP = localPrefixSum128V( keys, lIdx.x, total );\n"
"			uint4 rankOfN = uint4(startAddrBlock, startAddrBlock+1, startAddrBlock+2, startAddrBlock+3) - rankOfP + uint4( total, total, total, total );\n"
"\n"
"			uint4 myAddr = (keys==uint4(1,1,1,1))? rankOfP: rankOfN;\n"
"			\n"
"			GROUP_LDS_BARRIER;\n"
"\n"
"			SET_LOCAL_SORT_DATA( myAddr.x, myData[0] );\n"
"			SET_LOCAL_SORT_DATA( myAddr.y, myData[1] );\n"
"			SET_LOCAL_SORT_DATA( myAddr.z, myData[2] );\n"
"			SET_LOCAL_SORT_DATA( myAddr.w, myData[3] );\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"			\n"
"			GET_LOCAL_SORT_DATA( startAddrBlock+0, myData[0] );\n"
"			GET_LOCAL_SORT_DATA( startAddrBlock+1, myData[1] );\n"
"			GET_LOCAL_SORT_DATA( startAddrBlock+2, myData[2] );\n"
"			GET_LOCAL_SORT_DATA( startAddrBlock+3, myData[3] );\n"
"		}\n"
"		\n"
"		{//	create histogram -> prefix sum\n"
"			if( lIdx.x < NUM_BUCKET )\n"
"			{\n"
"				localHistogram[lIdx.x] = 0;\n"
"				localHistogram[NUM_BUCKET+lIdx.x] = 0;\n"
"			}\n"
"			GROUP_LDS_BARRIER;\n"
"			uint4 keys = uint4((myData[0].key>>m_startBit) & 0xf, (myData[1].key>>m_startBit) & 0xf, (myData[2].key>>m_startBit) & 0xf, (myData[3].key>>m_startBit) & 0xf);\n"
"			\n"
"			InterlockedAdd( localHistogram[NUM_BUCKET+keys.x], 1 );\n"
"			InterlockedAdd( localHistogram[NUM_BUCKET+keys.y], 1 );\n"
"			InterlockedAdd( localHistogram[NUM_BUCKET+keys.z], 1 );\n"
"			InterlockedAdd( localHistogram[NUM_BUCKET+keys.w], 1 );\n"
"			\n"
"			GROUP_LDS_BARRIER;\n"
"			\n"
"			uint hIdx = NUM_BUCKET+lIdx.x;\n"
"			if( lIdx.x < NUM_BUCKET )\n"
"			{\n"
"				myHistogram = localHistogram[hIdx];\n"
"			}\n"
"			GROUP_LDS_BARRIER;\n"
"	\n"
"\n"
"			if( lIdx.x < NUM_BUCKET )\n"
"			{\n"
"				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
"\n"
"				localHistogram[hIdx] += localHistogram[hIdx-1];\n"
"				localHistogram[hIdx] += localHistogram[hIdx-2];\n"
"				localHistogram[hIdx] += localHistogram[hIdx-4];\n"
"				localHistogram[hIdx] += localHistogram[hIdx-8];\n"
"			}\n"
"\n"
"			GROUP_LDS_BARRIER;\n"
"		}\n"
"		{//	write back\n"
"			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
"			{\n"
"				int dataIdx = startAddrBlock+ie;\n"
"				int binIdx = (myData[ie].key>>m_startBit)&0xf;\n"
"				int groupOffset = localHistogramToCarry[binIdx];\n"
"				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];\n"
"				\n"
"				dataToSortOut[ groupOffset + myIdx ] = myData[ie];\n"
"			}\n"
"		}\n"
"		\n"
"		GROUP_LDS_BARRIER;\n"
"		if( lIdx.x < NUM_BUCKET )\n"
"		{\n"
"			localHistogramToCarry[lIdx.x] += myHistogram;\n"
"		}\n"
"		GROUP_LDS_BARRIER;\n"
"	\n"
"	}\n"
"}\n"
"*/\n"
"\n"
/* Resources of the count and scan kernels (same slots as rHistogram and
 * dataToSort above). */
"StructuredBuffer<KeyValuePair> dataToSort1 : register( t0 );\n"
"RWStructuredBuffer<u32> wHistogram1 : register(u0);\n"
"\n"
/* Counter of bucket idx in the calling thread's private column.  lIdx is a
 * scalar where the macro is used; .x on a scalar selects the scalar itself. */
"#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx.x]\n"
"\n"
/* StreamCountKernel
 * Each thread counts the digits of its elements (4 per block, over all
 * blocks of this group) in its own column of localHistogramMat, so the
 * increments need no atomics.  The 128 columns of every bucket row are then
 * summed into column 0 and stored at
 *   wHistogram1[bucket*m_nWorkGroupsToExecute + group].
 * The unused local array 'int myHistogram[NUM_BUCKET]' was removed. */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void StreamCountKernel( DEFAULT_ARGS )        \n"
"{\n"
"	u32 lIdx = GET_LOCAL_IDX;\n"
"	u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
"	for(int i=0; i<NUM_BUCKET; i++)\n"
"	{\n"
"		MY_HISTOGRAM(i) = 0;\n"
"	}\n"
"\n"
"	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
"	{\n"
"		uint localKeys[4];\n"
"		{	//	read data\n"
"			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"\n"
"			uint4 localAddress = uint4(lIdx, lIdx, lIdx, lIdx)*4+uint4(0,1,2,3);\n"
"			uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;\n"
"\n"
"			KeyValuePair localData0 = dataToSort1[globalAddress.x];\n"
"			KeyValuePair localData1 = dataToSort1[globalAddress.y];\n"
"			KeyValuePair localData2 = dataToSort1[globalAddress.z];\n"
"			KeyValuePair localData3 = dataToSort1[globalAddress.w];\n"
"\n"
"			localKeys[0] = (localData0.key >> m_startBit) & 0xf;\n"
"			localKeys[1] = (localData1.key >> m_startBit) & 0xf;\n"
"			localKeys[2] = (localData2.key >> m_startBit) & 0xf;\n"
"			localKeys[3] = (localData3.key >> m_startBit) & 0xf;\n"
"		}\n"
"\n"
"		MY_HISTOGRAM( localKeys[0] )++;\n"
"		MY_HISTOGRAM( localKeys[1] )++;\n"
"		MY_HISTOGRAM( localKeys[2] )++;\n"
"		MY_HISTOGRAM( localKeys[3] )++;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
/* Threads 0..63 fold rows 0..7, threads 64..127 fold rows 8..15, halving the
 * distance each step (64,32,...,1); only column 0 of a row is meaningful
 * afterwards.  No barrier separates the steps.
 * NOTE(review): presumably relies on each 64-thread half running in
 * lockstep - confirm for the target hardware. */
"	{	//	reduce to 1\n"
"		if( lIdx < 64 )//WG_SIZE/2 )\n"
"		{\n"
"			for(int i=0; i<NUM_BUCKET/2; i++)\n"
"			{\n"
"				int idx = lIdx;\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
"			}\n"
"		}\n"
"		else if( lIdx < 128 )\n"
"		{\n"
"			for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)\n"
"			{\n"
"				int idx = lIdx-64;\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
"			}\n"
"		}\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	{	//	write data\n"
"		if( lIdx < NUM_BUCKET )\n"
"		{\n"
"			wHistogram1[ lIdx*m_nWorkGroupsToExecute + wgIdx.x ] = localHistogramMat[ lIdx*WG_SIZE+0 ];\n"
"		}\n"
"	}\n"
"}\n"
"\n"
/* Disabled earlier variant of StreamCountKernel (per-thread register
 * histogram), kept inside an HLSL block comment exactly as found. */
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void StreamCountKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )        \n"
"{\n"
"	int myHistogram[NUM_BUCKET];\n"
"\n"
"	for(int i=0; i<NUM_BUCKET; i++)\n"
"	{\n"
"		myHistogram[i] = 0;\n"
"	}\n"
"\n"
"	for(uint igroup=gIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(gIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
"	{\n"
"		uint localKeys[4];\n"
"		{	//	read data\n"
"			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"\n"
"			uint4 localAddress = uint4(lIdx.x, lIdx.x, lIdx.x, lIdx.x)*4+uint4(0,1,2,3);\n"
"			uint4 globalAddress = uint4(igroup,igroup,igroup,igroup)*numLocalElements + localAddress;\n"
"\n"
"			KeyValuePair localData0 = dataToSort1[globalAddress.x];\n"
"			KeyValuePair localData1 = dataToSort1[globalAddress.y];\n"
"			KeyValuePair localData2 = dataToSort1[globalAddress.z];\n"
"			KeyValuePair localData3 = dataToSort1[globalAddress.w];\n"
"\n"
"			localKeys[0] = (localData0.key >> m_startBit) & 0xf;\n"
"			localKeys[1] = (localData1.key >> m_startBit) & 0xf;\n"
"			localKeys[2] = (localData2.key >> m_startBit) & 0xf;\n"
"			localKeys[3] = (localData3.key >> m_startBit) & 0xf;\n"
"		}\n"
"\n"
"		myHistogram[ localKeys[0] ]++;\n"
"		myHistogram[ localKeys[1] ]++;\n"
"		myHistogram[ localKeys[2] ]++;\n"
"		myHistogram[ localKeys[3] ]++;\n"
"	}\n"
"\n"
"	{	//	move to shared\n"
"		for(int i=0; i<NUM_BUCKET; i++)\n"
"		{\n"
"			localHistogramMat[i*WG_SIZE+lIdx.x] = myHistogram[i];\n"
"		}\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	{	//	reduce to 1\n"
"		if( lIdx.x < 64 )//WG_SIZE/2 )\n"
"		{\n"
"			for(int i=0; i<NUM_BUCKET/2; i++)\n"
"			{\n"
"				int idx = lIdx.x;\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
"			}\n"
"		}\n"
"		else if( lIdx.x < 128 )\n"
"		{\n"
"			for(int i=NUM_BUCKET/2; i<NUM_BUCKET; i++)\n"
"			{\n"
"				int idx = lIdx.x-64;\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+64];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+32];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+16];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+8];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+4];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+2];\n"
"				localHistogramMat[i*WG_SIZE+idx] += localHistogramMat[i*WG_SIZE+idx+1];\n"
"			}\n"
"		}\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	{	//	write data\n"
"		if( lIdx.x < NUM_BUCKET )\n"
"		{\n"
"			wHistogram1[ lIdx.x*m_nWorkGroupsToExecute + gIdx.x ] = localHistogramMat[ lIdx.x*WG_SIZE+0 ];\n"
"		}\n"
"	}\n"
"}\n"
"*/\n"
"\n"
/* Disabled 4-entries-per-thread variant of PrefixScanKernel, kept inside an
 * HLSL block comment exactly as found. */
"/*\n"
"//	for MAX_WG_SIZE 20\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( uint3 gIdx : SV_GroupID, uint3 lIdx : SV_GroupThreadID )        \n"
"{\n"
"	uint4 myData = uint4(0,0,0,0);\n"
"	if( 4*lIdx.x+0 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
"		myData.x = wHistogram1[4*lIdx.x+0];\n"
"	if( 4*lIdx.x+1 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
"		myData.y = wHistogram1[4*lIdx.x+1];\n"
"	if( 4*lIdx.x+2 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
"		myData.z = wHistogram1[4*lIdx.x+2];\n"
"	if( 4*lIdx.x+3 < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
"		myData.w = wHistogram1[4*lIdx.x+3];\n"
"\n"
"	uint totalSum;\n"
"\n"
"	uint4 scanned = localPrefixSum128V( myData, lIdx.x, totalSum );\n"
"\n"
"	wHistogram1[4*lIdx.x+0] = scanned.x;\n"
"	wHistogram1[4*lIdx.x+1] = scanned.y;\n"
"	wHistogram1[4*lIdx.x+2] = scanned.z;\n"
"	wHistogram1[4*lIdx.x+3] = scanned.w;\n"
"}\n"
"*/\n"
"\n"
/* PrefixScanKernel
 * In-place exclusive prefix sum over the first
 * NUM_BUCKET*m_nWorkGroupsToExecute entries of wHistogram1, in index order.
 * Every thread owns 12 consecutive entries, handled as four lanes of three:
 * the four lane totals are scanned across the group with
 * localPrefixSum128V, and the three entries of each lane are then rebuilt
 * from the lane's exclusive base.  Entries past the valid range are read as
 * zero.
 * NOTE(review): the write-back loop is not range-guarded; it presumably
 * relies on the buffer being large enough or on out-of-range writes being
 * discarded - confirm against the host-side allocation. */
"//	for MAX_WG_SIZE 80\n"
"//	can hold up to WG_SIZE*12 (128*12 > 80*16 )\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( DEFAULT_ARGS )\n"
"{\n"
"	u32 lIdx = GET_LOCAL_IDX;\n"
"	u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
"	uint data[12] = {0,0,0,0,0,0,0,0,0,0,0,0};\n"
"	for(int i=0; i<12; i++)\n"
"	{\n"
"		if( int(12*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
"			data[i] = wHistogram1[12*lIdx+i];\n"
"	}\n"
"\n"
/* Fix: each lane must total its three entries.  The pair sums used before
 * (data[0]+data[1] ... data[6]+data[7]) belong to the 8-entry variant and
 * left data[8..11] out of the scan, which contradicts the three-per-lane
 * reconstruction below. */
"	uint4 myData = uint4(0,0,0,0);\n"
"	myData.x = data[0] + data[1] + data[2];\n"
"	myData.y = data[3] + data[4] + data[5];\n"
"	myData.z = data[6] + data[7] + data[8];\n"
"	myData.w = data[9] + data[10] + data[11];\n"
"\n"
"\n"
"	uint totalSum;\n"
"	uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );\n"
"\n"
/* Rebuilt from the highest index down so that every right-hand side still
 * reads the original (unscanned) values of the lower entries. */
"	data[11] = scanned.w + data[9] + data[10];\n"
"	data[10] = scanned.w + data[9];\n"
"	data[9] = scanned.w;\n"
"	data[8] = scanned.z + data[6] + data[7];\n"
"	data[7] = scanned.z + data[6];\n"
"	data[6] = scanned.z;\n"
"	data[5] = scanned.y + data[3] + data[4];\n"
"	data[4] = scanned.y + data[3];\n"
"	data[3] = scanned.y;\n"
"	data[2] = scanned.x + data[0] + data[1];\n"
"	data[1] = scanned.x + data[0];\n"
"	data[0] = scanned.x;\n"
"\n"
"	for(int i=0; i<12; i++)\n"
"	{\n"
"		wHistogram1[12*lIdx+i] = data[i];\n"
"	}\n"
"}\n"
/* Disabled 8-entries-per-thread variant of PrefixScanKernel, kept inside an
 * HLSL block comment exactly as found. */
"/*\n"
"[numthreads(WG_SIZE, 1, 1)]\n"
"void PrefixScanKernel( DEFAULT_ARGS )\n"
"{\n"
"	u32 lIdx = GET_LOCAL_IDX;\n"
"	u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
"	uint data[8] = {0,0,0,0,0,0,0,0};\n"
"	for(int i=0; i<8; i++)\n"
"	{\n"
"		if( int(8*lIdx+i) < NUM_BUCKET*m_nWorkGroupsToExecute )\n"
"			data[i] = wHistogram1[8*lIdx+i];\n"
"	}\n"
"\n"
"	uint4 myData = uint4(0,0,0,0);\n"
"	myData.x = data[0] + data[1];\n"
"	myData.y = data[2] + data[3];\n"
"	myData.z = data[4] + data[5];\n"
"	myData.w = data[6] + data[7];\n"
"\n"
"\n"
"	uint totalSum;\n"
"	uint4 scanned = localPrefixSum128V( myData, lIdx, totalSum );\n"
"\n"
"	data[7] = scanned.w + data[6];\n"
"	data[6] = scanned.w;// + data[5];\n"
"	data[5] = scanned.z + data[4];\n"
"	data[4] = scanned.z;// + data[3];\n"
"	data[3] = scanned.y + data[2];\n"
"	data[2] = scanned.y;// + data[1];\n"
"	data[1] = scanned.x + data[0];\n"
"	data[0] = scanned.x;\n"
"\n"
"	for(int i=0; i<8; i++)\n"
"	{\n"
"		wHistogram1[8*lIdx+i] = data[i];\n"
"	}\n"
"}\n"
"*/\n"
"\n"
"\n"
/* CopyKernel
 * Copies every block assigned to this work-group from dataToSort to the same
 * positions of dataToSortOut, 4 consecutive elements per thread. */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void CopyKernel( DEFAULT_ARGS )\n"
"{\n"
"	u32 lIdx = GET_LOCAL_IDX;\n"
"	u32 wgIdx = GET_GROUP_IDX;\n"
"\n"
"	for(uint igroup=wgIdx.x*m_nBlocksPerGroup; igroup<min2(m_totalBlocks,(wgIdx.x+1)*m_nBlocksPerGroup); igroup++)\n"
"	{\n"
"		KeyValuePair myData[4];\n"
"		uint startAddrBlock;\n"
"		{	//	read data\n"
"			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"			startAddrBlock = lIdx*4;\n"
"			uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
"			myData[0] = dataToSort[startAddress+0];\n"
"			myData[1] = dataToSort[startAddress+1];\n"
"			myData[2] = dataToSort[startAddress+2];\n"
"			myData[3] = dataToSort[startAddress+3];\n"
"		}\n"
"\n"
"		{\n"
"			int numLocalElements = WG_SIZE*ELEMENTS_PER_WORK_ITEM;\n"
"			uint startAddress = igroup*numLocalElements + startAddrBlock;\n"
"\n"
"			dataToSortOut[startAddress+0] = myData[0];\n"
"			dataToSortOut[startAddress+1] = myData[1];\n"
"			dataToSortOut[startAddress+2] = myData[2];\n"
"			dataToSortOut[startAddress+3] = myData[3];\n"
"		}\n"
"	}\n"
"}\n"
;
